Assembling assays into a master MultiAssayExperiment object

Author

Laura Symul, Laura Vermeren

Published

February 27, 2026

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(MultiAssayExperiment)

tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

The purpose of this document is to create a MultiAssayExperiment object containing all assays and clinical data generated in the context of the VIBRANT trial at the participant x visit resolution (which means that duplicates were aggregated in previous steps such that for a given assay/variable, there is a single value for each participant and visit combination).

The document is organised as follow:

At the end, we document which assays are available (= have data) for which participants and visits.

Clinical data (CRF data)

CRF data (raw and QCed)

Code
crf_files <- 
  get_01_output_dir() |> 
  fs::dir_ls() |> 
  str_subset("/01_")

crf_clean_file <- crf_files |> str_subset("01_crf_clean") |> sort(decreasing = TRUE) |> magrittr::extract(1)
crf_raw_file <- crf_files |> str_subset("01_crf_raw") |> sort(decreasing = TRUE) |> magrittr::extract(1)
crf_dict_file <- crf_files |> str_subset("01_crf_plates_dictionary") |> sort(decreasing = TRUE) |> magrittr::extract(1)

load(crf_clean_file, verbose = TRUE)
Loading objects:
  crf_clean
Code
load(crf_raw_file, verbose = TRUE)
Loading objects:
  crf_raw
Code
load(crf_dict_file, verbose = TRUE)
Loading objects:
  crf_plates_dictionary
Code
rm(crf_clean_file, crf_raw_file, crf_dict_file)

Participants table

Code
load(
  crf_files |> str_subset("01_participants_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  participants
Code
load(
  crf_files |> str_subset("01_participants_variable_dictionary") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  participants_variable_dictionary
Code
load(
  crf_files |> str_subset("01_participant_crfs_merged") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  participant_crfs_merged

Visits table

Code
load(
  crf_files |> str_subset("01_visits_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  visits
Code
load(
  crf_files |> str_subset("01_visits_long_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  visits_long
Code
load(
  crf_files |> str_subset("01_visits_crfs_merged") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  visits_crfs_merged
Code
visits |> 
  ggplot() +
  aes(x = study_day, y = pid |> factor(), color = visit_code |> factor()) +
  geom_point() 

We define the uid (unique identifier):

Code
visits <- 
  visits |> 
  mutate(
    uid = str_c(pid, visit_code, sep = "_")
  ) |> 
  select(uid, everything())

Exposures table

Code
load(
  crf_files |> str_subset("01_exposures_") |> sort(decreasing = TRUE) |> magrittr::extract(1),
  verbose = TRUE
  )
Loading objects:
  exposures

Assays: loading the SummarizedExperiment objects

Code
data_dir <- get_01_output_dir()
se_files <- fs::dir_ls(data_dir, regexp = "se_.*\\.rds$") |> sort(decreasing = TRUE)

Metagenomics data

Code
mg_file <- se_files[grepl("mg", se_files)][1]
se_mg <- readRDS(mg_file)
se_mg <- check_se(se_mg)
se_mg
class: SummarizedExperiment 
dim: 779 996 
metadata(3): name date dictionary
assays(5): count count_corr rel_ab_all rel_ab_bact rel_ab
rownames(779): C0022A1 C0059E1 ... Mycoplasma_arginini
  Streptococcus_virus
rowData names(16): taxon_id taxon_label ... strain_origin biose_id
colnames(996): 068100004_0000 068100004_1000 ... EQ05822515 MC1
colData names(35): uid mg_uid ... notes reextraction_status

qPCR data

Code
qPCR_file <- se_files[grepl("03_se_pcr_agg", se_files)][1]
se_qPRC <- readRDS(qPCR_file)
se_qPRC <- check_se(se_qPRC)
se_qPRC
class: SummarizedExperiment 
dim: 16 1029 
metadata(3): description date assay_and_coldata_dictionary
assays(8): n_non_na dilution ... copies_per_swab_LL3_med
  copies_per_swab_LM_med
rownames(16): C0175A1 C0022A1 ... FF00072 16S
rowData names(9): target fluor ... biose_id pcr_plate_ids
colnames(1029): 068100004_0000 068100004_1000 ... EQ05822515 EQ05822516
colData names(7): uid qpcr_sample_type ... vibr_sample_id
  ext_lib_plate_nb

qPCR data for daily swabs (generated post-unblinding)

Code
daily_qPCR_file <- se_files[grepl("30_se_pcr_agg", se_files)][1]
se_daily_qPRC <- readRDS(daily_qPCR_file)
se_daily_qPRC <- check_se(se_daily_qPRC)
se_daily_qPRC
class: SummarizedExperiment 
dim: 16 2990 
metadata(3): description date assay_and_coldata_dictionary
assays(5): n_non_na dilution copies_per_swab_med copies_per_swab_mean
  copies_per_swab_cv
rownames(16): C0175A1 C0022A1 ... FF00072 16S
rowData names(8): target fluor ... strain_origin biose_id
colnames(2990): 068100004_1001 068100004_1002 ... 068200465_1406
  068200465_1407
colData names(9): uid qpcr_sample_type ... pcr_plate_nb
  ext_lib_plate_nb

16S rRNA amplicon sequencing

Code
amplicon_file <- se_files[grepl("16S_agg", se_files)][1]
se_16S <- readRDS(amplicon_file)
se_16S <- check_se(se_16S)
se_16S
class: SummarizedExperiment 
dim: 817 4119 
metadata(4): name date dictionary asv_tax_table
assays(2): counts rel_ab
rownames(817): Lactobacillus_iners Lactobacillus_crispatus ...
  Oliverpabstia_sp000432335 Corynebacterium_ulcerans
rowData names(12): taxon_id taxon_label ... asv_ids
  last_available_taxonomic_level
colnames(4119): 068100004_0000 068100004_1000 ...
  Nuclease_free_water_plate1_Pool10 Unusedswab_C2_plate1_Pool10
colData names(32): uid exclude_sample ... ext_lib_plate_nb resequenced

Luminex data

Code
luminex_file <- se_files[grepl("luminex_agg", se_files)][1]
se_luminex <- readRDS(luminex_file)
se_luminex <- check_se(se_luminex)
se_luminex
class: SummarizedExperiment 
dim: 48 850 
metadata(7): n_plates names ... extremes se_creation_date
assays(7): obs_conc obs_conc_imp ... obs_conc_log10 obs_conc_imp_log10
rownames(48): CTACK Eotaxin ... VEGF b-NGF
rowData names(3): prop_OOR_per_analyte exclude_analyte
  exclude_analyte_reason
colnames(850): 068100004_0000 068100004_1000 ... 068200465_1900
  068200465_2120
colData names(25): uid sample_id ... visit_code_assay location_assay

Flow cytometry data

Code
flow_file <- se_files[grepl("flow", se_files)][1]
se_flow <- readRDS(flow_file)
se_flow <- check_se(se_flow)
se_flow
class: SummarizedExperiment 
dim: 11 356 
metadata(1): gate_mapping
assays(2): count percentage
rownames(11): Live Granulocytes ... DblePosCD4 DblePosCD8
rowData names(3): cell_type parent_cell_type cell_type_label
colnames(356): 068100004_1000 068100004_1100 ... 068200465_1500
  068200465_2120
colData names(5): uid sample machine sample_type control_type

Creating the MultiAssayExperiment object

Assay list

Code
assay_list <- 
  bind_rows(
    tibble(
      mae_assay_name = "mg", 
      se_object_name = "se_mg", 
      assay_label = "Metagenomic"
      ),
    tibble(
      mae_assay_name = "qPCR", 
      se_object_name = "se_qPRC", 
      assay_label = "qPCR weekly swabs"
      ),
    tibble(
      mae_assay_name = "qPCR_daily", 
      se_object_name = "se_daily_qPRC", 
      assay_label = "qPCR home swabs"
      ),
    tibble(
      mae_assay_name = "amplicon", 
      se_object_name = "se_16S", 
      assay_label = "16S rRNA amplicon seq."
      ),
    tibble(
      mae_assay_name = "luminex", 
      se_object_name = "se_luminex", 
      assay_label = "Luminex"
      ),
    tibble(
      mae_assay_name = "flow", 
      se_object_name = "se_flow", 
      assay_label = "Flow cytometry"
      )
  ) |> 
  mutate(
    assay_label = assay_label |> fct_inorder()
  )

We will assemble the following tables (SE objects):

Code
assay_list |> gt()
mae_assay_name se_object_name assay_label
mg se_mg Metagenomic
qPCR se_qPRC qPCR weekly swabs
qPCR_daily se_daily_qPRC qPCR home swabs
amplicon se_16S 16S rRNA amplicon seq.
luminex se_luminex Luminex
flow se_flow Flow cytometry

mae_coldata_assays tables

Code
available_samples <- 
  map(
    1:nrow(assay_list),
    function(i, assay_list){
      se <- get(assay_list$se_object_name[i])
      tibble(
        assay = assay_list$mae_assay_name[i],
        se_colname = se |> colnames(), 
        uid = se$uid, 
        sample_type = se$sample_type,
        control_type = se$control_type,
        )
    },
    assay_list = assay_list
  ) |> 
  bind_rows()

if (!all(available_samples$se_colname == available_samples$uid, na.rm = TRUE)) {
  stop("The uid column in the SE objects does not match their colnames")
}

mae_coldata_assays <-
  available_samples |>
  group_by(uid) |>
  summarize(
    n_sample_type = n_distinct(sample_type),
    n_control_type = n_distinct(control_type),
    sample_type = sample_type[1],
    control_type = control_type[1],
    assays = str_c(assay, collapse = ", ")
  )

if (any(mae_coldata_assays$n_sample_type > 1)) {
  warning("Some samples have multiple sample types")
}

if (any(mae_coldata_assays$n_control_type > 1)) {
  warning("Some samples have multiple control types")
}

mae_coldata_assays <- 
  mae_coldata_assays |> 
  select(uid, sample_type, control_type, assays) 

The mae_coldata_assays table contains the following columns:

Code
mae_coldata_assays |> colnames()
[1] "uid"          "sample_type"  "control_type" "assays"      
Code
mae_coldata_assays |> str()
tibble [4,185 × 4] (S3: tbl_df/tbl/data.frame)
 $ uid         : chr [1:4185] "068100004_0000" "068100004_1000" "068100004_1001" "068100004_1002" ...
 $ sample_type : chr [1:4185] "Clinical sample" "Clinical sample" "Clinical sample" "Clinical sample" ...
 $ control_type: chr [1:4185] "" "" "" "" ...
 $ assays      : chr [1:4185] "mg, qPCR, amplicon, luminex" "mg, qPCR, amplicon, luminex, flow" "qPCR_daily, amplicon" "qPCR_daily, amplicon" ...
Code
mae_coldata_assays |> 
  dplyr::count(sample_type, control_type) |> 
  gt(caption = "Number of samples per sample type and control type") 
Number of samples per sample type and control type
sample_type control_type n
Clinical sample 3962
Clinical sample (other study) 7
Negative control Nuclease-free water 46
Negative control Unused swab + C2 47
Positive control Mock 1 57
Positive control Mock 2 57
Test sample 9

We also remove the columns sample_type and control_type from the colData of the individual assays to avoid conflicts when joining with the mae@colData table in downstream analyses.

Code
for (i in 1:nrow(assay_list)) {
  se <- get(assay_list$se_object_name[i])
  if (("sample_type" %in% colnames(colData(se))) | ("control_type" %in% colnames(colData(se)))) {
    colData(se) <- colData(se)[, !colnames(colData(se)) %in% c("sample_type", "control_type")]
  }
  assign(assay_list$se_object_name[i], se)
}

MAE@colData: joining the visits and mae_coldata_assays tables

We first do a full join of the visits and mae_coldata_assays tables by uid. This will allow us to keep all visits, even those that do not have assay data, and to identify which visits are in the CRF data and which are in the assay data.

Code
mae_coldata <- 
  # we first do a full join of the CRF visits and mae_coldata_assays by uids;
  dplyr::full_join(
    visits |> select(-starts_with("specimen_collection")) |> mutate(in_CRF = TRUE),
    mae_coldata_assays |> mutate(in_assays = TRUE) |> select(in_assays, everything()),
    by = join_by(uid)
  ) |> 
  mutate(
    in_CRF = in_CRF |> replace_na(FALSE),
    in_assays = in_assays |> replace_na(FALSE)
  )
Code
# We notice that there are 5 "Clinical sample" entries for which we do have assay data but that did not appear in the CRF data:

mae_coldata |> 
  filter(is.na(visit_code), sample_type == "Clinical sample") 
  
# we fix these below
Code
mae_coldata <- 
  mae_coldata |> 
  mutate(
    needs_fixing = is.na(pid) & (sample_type == "Clinical sample") & str_detect(uid, "[0-9]{9}_[0-9]{4}"),
    pid = 
      case_when(
        needs_fixing ~ str_remove(uid, "_[0-9]{4}"),
        TRUE ~ pid
      ),
    visit_code =
      case_when(
        needs_fixing ~ str_remove(uid, "[0-9]{9}_"),
        TRUE ~ visit_code
      ),
    study_day = 
      case_when(
        needs_fixing & (visit_code %in% c("0001", "0011")) ~ -7,
        needs_fixing & in_assays & (visit_code == "1000") ~ 0,
        TRUE ~ study_day
      ),
    visit_attended = 
      case_when(
        is.na(visit_attended) & needs_fixing ~ TRUE,
        TRUE ~ visit_attended
      ),
    visit_planned = 
      case_when(
        is.na(visit_planned) & needs_fixing ~ "Unplanned visit",
        TRUE ~ visit_planned
      ),
    visit_type = 
      case_when(
        is.na(visit_type) & needs_fixing ~ "Clinic",
        TRUE ~ visit_type
      )
  ) 

We then merge that table with the participants table to add the participant-invariant information (e.g., randomized, mITT, etc.) to the mae_coldata table.

Code
# we add the participant-invariant information from the `participants` table
mae_coldata <- 
  mae_coldata |> 
  dplyr::left_join(
    participants,
    by = join_by(pid)
  )
Code
mae_coldata <- mae_coldata |> select(-needs_fixing)
Code
mae_coldata |> 
  dplyr::count(in_CRF, randomized, in_assays, sample_type, assays) |> 
  gt()
in_CRF randomized in_assays sample_type assays n
FALSE FALSE TRUE Clinical sample mg, qPCR, amplicon, flow 2
FALSE FALSE TRUE Clinical sample qPCR_daily, amplicon 1
FALSE NA TRUE Clinical sample (other study) mg, qPCR 7
FALSE NA TRUE Negative control amplicon 70
FALSE NA TRUE Negative control mg, qPCR 12
FALSE NA TRUE Negative control qPCR 11
FALSE NA TRUE Positive control amplicon 91
FALSE NA TRUE Positive control mg 1
FALSE NA TRUE Positive control mg, qPCR 10
FALSE NA TRUE Positive control qPCR 12
FALSE NA TRUE Test sample qPCR 8
FALSE NA TRUE Test sample qPCR_daily 1
TRUE FALSE FALSE NA NA 414
TRUE FALSE TRUE Clinical sample mg, qPCR, amplicon 66
TRUE FALSE TRUE Clinical sample mg, qPCR, amplicon, luminex 5
TRUE FALSE TRUE Clinical sample qPCR, amplicon 1
TRUE TRUE FALSE NA NA 864
TRUE TRUE TRUE Clinical sample amplicon 4
TRUE TRUE TRUE Clinical sample flow 1
TRUE TRUE TRUE Clinical sample luminex, flow 1
TRUE TRUE TRUE Clinical sample mg, qPCR, amplicon 49
TRUE TRUE TRUE Clinical sample mg, qPCR, amplicon, flow 3
TRUE TRUE TRUE Clinical sample mg, qPCR, amplicon, luminex 491
TRUE TRUE TRUE Clinical sample mg, qPCR, amplicon, luminex, flow 346
TRUE TRUE TRUE Clinical sample mg, qPCR, luminex 2
TRUE TRUE TRUE Clinical sample mg, qPCR, qPCR_daily, amplicon, luminex, flow 2
TRUE TRUE TRUE Clinical sample qPCR, amplicon, luminex 1
TRUE TRUE TRUE Clinical sample qPCR, amplicon, luminex, flow 1
TRUE TRUE TRUE Clinical sample qPCR_daily, amplicon 2985
TRUE TRUE TRUE Clinical sample qPCR_daily, amplicon, luminex 1

Imputing study_day for daily visit_code that did not have CRF data

There were a few entries with daily visit_code that did not have a date in the CRF data. Consequently they also did not have study days (study_day) assigned.

Code
mae_coldata |> 
  filter(
    is.na(study_day), 
    is.na(sample_type) | (sample_type == "Clinical sample")
    ) |> 
  dplyr::count(visit_type, assays, in_CRF, visit_attended, sample_type)
# A tibble: 5 × 6
  visit_type assays               in_CRF visit_attended sample_type         n
  <chr>      <chr>                <lgl>  <lgl>          <chr>           <int>
1 Clinic     <NA>                 TRUE   FALSE          <NA>               38
2 Home       qPCR_daily, amplicon TRUE   FALSE          Clinical sample   100
3 Home       qPCR_daily, amplicon TRUE   TRUE           Clinical sample     3
4 Home       <NA>                 TRUE   FALSE          <NA>              540
5 Home       <NA>                 TRUE   TRUE           <NA>                7

We impute the missing values using the study_days from “surrounding” visits.

Code
# first, we create a "dictionary" with the most common study_days for each daily swab visit code
study_days_for_daily_codes <- 
  mae_coldata |> 
  dplyr::filter(visit_type == "Home") |> 
  dplyr::count(visit_code, study_day) |> 
  arrange(visit_code, -n) |> 
  group_by(visit_code) |> 
  slice_head(n = 1) |> 
  ungroup() |> 
  dplyr::rename(exp_study_day = study_day) |> 
  select(-n)
Code
study_days_for_daily_codes |> 
  ggplot(aes(x = exp_study_day, y = visit_code)) +
  geom_point()
Code
# In the plot above, we notice that visits 1501:1503 did not have "expected" study day, so we create it by expanding the time series
study_days_for_daily_codes <-
  study_days_for_daily_codes |> 
  mutate(
    exp_study_day = 
      case_when(
        is.na(exp_study_day) & (visit_code %in% c(1501:1503)) ~ 
          as.numeric(visit_code) - 1501 + 36,
        TRUE ~ exp_study_day
      )
  )
Code
# visits from the participants with missing study_days
pid_with_missing_daily_study_days <- 
  # we first filter to retrieve the pids
  mae_coldata |> 
  filter(is.na(study_day)) |>  # , in_assays
  select(pid) |> distinct() |> 
  # we join with the coldata
  left_join(mae_coldata, by = join_by(pid)) |> 
  arrange(pid, visit_code) |> 
  # we join the study_day dictionary
  left_join(study_days_for_daily_codes) |> 
  mutate(diff = study_day - exp_study_day) |> # the typical difference for a participant between their study_day and the "expected" study_day (some participant started sampling at home one day early or late)
  group_by(pid) |> 
  fill(diff, .direction = "downup") |>  # impute when missing
  mutate(diff = ifelse(all(is.na(diff)), 0, diff)) |> # if still missing,we set to 0
  ungroup() |> 
  # we fix the study_day
  mutate(
    fixed_study_day = 
      case_when(
        is.na(study_day) & !is.na(exp_study_day) ~ exp_study_day + diff,
        TRUE ~ study_day
      )
  ) |> 
  select(uid, pid, visit_code, study_day, fixed_study_day, exp_study_day, diff, in_CRF, in_assays, crf_plates, assays)

# pid_with_missing_daily_study_days |> View()
Code
mae_coldata <- 
  mae_coldata |> 
  left_join(
    pid_with_missing_daily_study_days |> select(uid, fixed_study_day), 
    by = join_by(uid)
    ) |> 
  mutate(
    study_day =
      case_when(
        is.na(study_day) ~ fixed_study_day,
        TRUE ~ study_day
      )
  ) |> 
  select(-fixed_study_day)
Code
mae_coldata |> 
  filter(is.na(study_day), in_assays) |> 
  View()

# all is fixed

Creating the visit, visit_number, and study_week columns

Code
visit_dict <- 
  bind_rows(
    tibble(
      visit = "Screening", 
      definition = "Screening visit for which we have the most assay data. If two screening visits have the same number of assays, we take the last one.", 
      possible_visit_codes = "0000, 0001, 0010, 0011",
      visit_number = "V0",
      study_week = -1
    ),
    tibble(
      visit = "Add. screening", 
      definition = 'If a participant has data for two screening visits, one is the "main" screening visit, and the other is labelled as an additional screening visit.', 
      possible_visit_codes = "0000, 0001, 0010, 0011",
      visit_number = "V0b",
      study_week = -2
      ),
    tibble(
      visit = "Day 0 (pre-MTZ)", 
      definition = "Visit before the metronidazole treatment (MTZ) starts.", 
      possible_visit_codes = "1000, 1011",
      visit_number = "V1",
      study_week = 0
      ),
    tibble(
      visit = "Week 1 (post-MTZ)", 
      definition = "Visit after the metronidazole treatment (MTZ) ends.", 
      possible_visit_codes = "1100",
      visit_number = "V2",
      study_week = 1
      ),
    tibble(
      visit = "Week 2 (post-SP)", 
      definition = "Visit after the study product (SP) ends.", 
      possible_visit_codes = "1200, 1211",
      visit_number = "V3",
      study_week = 2
      ),
    tibble(
      visit = "Week 3", 
      definition = "Visit one week after the study product (SP) ends.", 
      possible_visit_codes = "1300",
      visit_number = "V4",
      study_week = 3
      ),
    tibble(
      visit = "Week 4", 
      definition = "Visit two weeks after the study product (SP) ends.", 
      possible_visit_codes = "1400",
      visit_number = "V5",
      study_week = 4
      ),
    tibble(
      visit = "Week 5", 
      definition = "Visit three weeks after the study product (SP) ends.", 
      possible_visit_codes = "1500, 1511, 1512",
      visit_number = "V6",
      study_week = 5
      ),
    tibble(
      visit = "Week 7", 
      definition = "Visit five weeks after the study product (SP) ends.", 
      possible_visit_codes = "1700, 1711",
      visit_number = "V7",
      study_week = 7
      ),
    tibble(
      visit = "Week 9", 
      definition = "Visit seven weeks after the study product (SP) ends.", 
      possible_visit_codes = "1900, 1911",
      visit_number = "V8",
      study_week = 9
      ),
    tibble(
      visit = "Week 12", 
      definition = "Visit ten weeks after the study product (SP) ends.", 
      possible_visit_codes = "2120, 2121, 2122",
      visit_number = "V9",
      study_week = 12
    )
  ) |> 
  mutate(
    visit = visit |> fct_reorder(study_week)
  )
Code
visit_dict |> 
  gt() |> 
  tab_header(
    title = "Visit codes and definitions"
  ) |> 
  cols_label(
    visit = "Visit",
    definition = "Definition",
    possible_visit_codes = "Possible visit codes",
    visit_number = "Visit number",
    study_week = "Study week"
  ) |> 
  cols_align(
    align = "left", columns = c(visit, definition, possible_visit_codes)
  )
Visit codes and definitions
Visit Definition Possible visit codes Visit number Study week
Screening Screening visit for which we have the most assay data. If two screening visits have the same number of assays, we take the last one. 0000, 0001, 0010, 0011 V0 -1
Add. screening If a participant has data for two screening visits, one is the "main" screening visit, and the other is labelled as an additional screening visit. 0000, 0001, 0010, 0011 V0b -2
Day 0 (pre-MTZ) Visit before the metronidazole treatment (MTZ) starts. 1000, 1011 V1 0
Week 1 (post-MTZ) Visit after the metronidazole treatment (MTZ) ends. 1100 V2 1
Week 2 (post-SP) Visit after the study product (SP) ends. 1200, 1211 V3 2
Week 3 Visit one week after the study product (SP) ends. 1300 V4 3
Week 4 Visit two weeks after the study product (SP) ends. 1400 V5 4
Week 5 Visit three weeks after the study product (SP) ends. 1500, 1511, 1512 V6 5
Week 7 Visit five weeks after the study product (SP) ends. 1700, 1711 V7 7
Week 9 Visit seven weeks after the study product (SP) ends. 1900, 1911 V8 9
Week 12 Visit ten weeks after the study product (SP) ends. 2120, 2121, 2122 V9 12

We first determine, for each participant, which is their main vs. their additional screening visit.

Code
mae_coldata <- 
  mae_coldata |> 
  group_by(pid) |> 
  mutate(
    is_screening_visit = 
      case_when(
        visit_code %in% c("0000", "0001", "0010", "0011") ~ TRUE,
        TRUE ~ FALSE
      ),
    n_assays = ifelse(in_assays, str_split(assays, ", ") |> lengths(), 0),
    screening_visit_score = is_screening_visit * (n_assays + study_day/10)
  ) |> 
  arrange(-screening_visit_score) |> 
  mutate(
    screening_visit = 
      case_when(
        is_screening_visit & (row_number() == 1) ~ "main screening",
        is_screening_visit & (row_number() > 1) ~ "additional screening",
        TRUE ~ NA_character_
      )
  ) |> 
  ungroup() |> 
  select(-is_screening_visit, -n_assays, -screening_visit_score)
Code
mae_coldata |> 
  dplyr::filter(screening_visit == "additional screening") |> 
  select(pid) |> distinct() |> 
  left_join(mae_coldata) |> 
  arrange(randomized |> desc(), pid, visit_code) |> 
  select(randomized, pid, visit_code, study_day, crf_plates, assays, screening_visit) |> 
  View()

mae_coldata |> 
  dplyr::filter(screening_visit == "additional screening") |> 
  select(pid) |> distinct() |> 
  left_join(mae_coldata) |> 
  arrange(randomized |> desc(), pid, visit_code) |> 
  select(randomized, pid, visit_code, study_day, crf_plates, assays, screening_visit) |> 
  dplyr::filter(randomized, (screening_visit == "main screening"), study_day == 0) |> 
  select(pid) |> distinct() |> 
  left_join(mae_coldata) |> 
  arrange(randomized |> desc(), pid, visit_code) |> 
  select(randomized, mITT, pid, visit_code, study_day, crf_plates, assays, screening_visit) |> 
  View()


# > check 068200020
# unclear - her main screening visit could be at day -3, but it might be at day 0, which would mean that her screening visit and pre-MTZ visit samples were collected on the same day
crf_raw$crf35 |> dplyr::filter(pid == "068200020") |> View()
Code
mae_coldata |> 
  filter(!is.na(screening_visit)) |> 
  dplyr::count(screening_visit) 
# A tibble: 2 × 2
  screening_visit          n
  <chr>                <int>
1 additional screening    66
2 main screening         532

We next join the mae_coldata table with the visit_dict table to add the visit, visit_number, and study_week columns.

Code
mae_coldata <- 
  mae_coldata |> 
  select(-any_of(c("PIPV", "visit", "visit_number", "study_week"))) |>
  left_join(
    visit_dict |> 
      mutate(visit_code = possible_visit_codes |> str_split(", ")) |>
      unnest(visit_code) |>
      mutate(
        screening_visit = 
          case_when(
            visit_number == "V0b" ~ "additional screening",
            visit_number == "V0" ~ "main screening",
            TRUE ~ NA_character_
            ),
        PIPV = visit_number != "V0b"
        ) |> 
      select(visit_code, screening_visit, PIPV, visit, visit_number, study_week)
  ) |> 
  mutate(PIPV = PIPV |> replace_na(FALSE)) |> 
  relocate(
    PIPV, visit, visit_number, study_week, .after = "visit_code"
  ) 

We check that study_days are unique for each visit_number at which assay samples were collected for each participant.

Code
mae_coldata |> 
  group_by(pid, visit_number) |> 
  mutate(n_study_day = n_distinct(study_day[!is.na(assays)])) |>
  ungroup() |> 
  filter(PIPV, n_study_day > 1) |> 
  select(uid, pid, visit_code, study_day, assays) |> 
  group_by(pid) |> 
  gt(row_group_as_column = TRUE)
uid visit_code study_day assays
068200062 068200062_1500 1500 37 mg, qPCR, amplicon
068200062_1511 1511 44 luminex, flow
068200087 068200087_1500 1500 40 mg, qPCR, amplicon, luminex
068200087_1511 1511 43 flow

There are just two participants for which the dates don’t match and the different assays were collected at different dates. This may be important for integrative analyses later, so we leave as is for now.

Creating an assay to document specimen collection, and available data

  • specimen_collection_swab (collected, not collected, unclear)
  • specimen_collection_softcup (collected, not collected, unclear)
  • specimen_collection_cytobrush (collected, not collected, unclear)
  • mg
  • qPCR
  • amplicon
  • luminex
  • flow
Code
sample_and_visit_info_assay <- 
  mae_coldata |> 
  select(uid, sample_type, in_CRF, in_assays, randomized) 

sample_and_visit_info_assay <- 
  sample_and_visit_info_assay |> 
  dplyr::left_join(
    visits |> select(uid, starts_with("specimen_collection")),
    by = join_by(uid)
  ) 

sample_and_visit_info_assay <-
  sample_and_visit_info_assay |>
  expand_grid(assay = assay_list$mae_assay_name) |>
  dplyr::left_join(
    available_samples |>
      select(uid, assay) |>
      mutate(value = TRUE),
    by = join_by(uid, assay)
  ) |>
  mutate(value = value |> replace_na(FALSE)) |>
  pivot_wider(names_from = assay, values_from = value)

# sample_and_visit_info_assay <- 
#   sample_and_visit_info_assay |> 
#   mutate(
#     mg_category = 
#       case_when(
#         !str_detect(sample_type, "Clinical sample") & mg ~ "Not a clinical sample",
#         is.na(randomized) & mg ~ "???",
#         !is.na(randomized) & !randomized & mg ~ "Additional sample from non-randomized participants",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & mg ~ "Expected data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & mg ~ "Unexpected data",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !mg ~ "Missing data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !mg ~ "Sample not collected",
#         TRUE ~ "ERROR"
#       ),
#     qPCR_category = 
#       case_when(
#         !str_detect(sample_type, "Clinical sample") & qPCR ~ "Not a clinical sample",
#         is.na(randomized) & qPCR ~ "???",
#         !is.na(randomized) & !randomized & qPCR ~ "Additional sample from non-randomized participants",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & qPCR ~ "Expected data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & qPCR ~ "Unexpected data",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !qPCR ~ "Missing data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !qPCR ~ "Sample not collected",
#         TRUE ~ "ERROR"
#       ),
#     amplicon_category = 
#       case_when(
#         !str_detect(sample_type, "Clinical sample") & amplicon ~ "Not a clinical sample",
#         is.na(randomized) & amplicon ~ "???",
#         !is.na(randomized) & !randomized & amplicon ~ "Additional sample from non-randomized participants",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & amplicon ~ "Expected data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & amplicon ~ "Unexpected data",
#         !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !amplicon ~ "Missing data",
#         (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !amplicon ~ "Sample not collected",
#         TRUE ~ "ERROR"
#       ),
#     luminex_category = 
#       case_when(
#         !str_detect(sample_type, "Clinical sample") & luminex ~ "Not a clinical sample",
#         is.na(randomized) & luminex ~ "???",
#         !is.na(randomized) & !randomized & luminex ~ "Additional sample from non-randomized participants",
#         !is.na(specimen_collection_softcup) & (specimen_collection_softcup != "not collected") & luminex ~ "Expected data",
#         (is.na(specimen_collection_softcup) | (specimen_collection_softcup == "not collected")) & luminex ~ "Unexpected data",
#         !is.na(specimen_collection_softcup) & (specimen_collection_softcup == "collected") & !luminex ~ "Missing data",
#         (is.na(specimen_collection_softcup) | (specimen_collection_softcup == "not collected")) & !luminex ~ "Sample not collected",
#         TRUE ~ "ERROR"
#       )
#   )

# 
# The last 5 columns can take the following values:
# 
# - `expected` (swab collected and available data), 
# - `unexpected` (swab not collected but available data), 
# - `missing` (swab collected but no available data),
# - `maybe missing` (unclear if swab was collected and no available data),
# - `not collected` (swab not collected), 
# - `additional` (swab collected from a non-randomized participant), 
# - `NA` (not collected, no available data)
# 
Code
se_sample_and_visit_info_assay <- 
  SummarizedExperiment(
    assays = list(
      sample_and_visit_info = 
        sample_and_visit_info_assay |> 
        select(-c(sample_type, in_CRF, in_assays, randomized)) |> 
        as.data.frame() |> 
        column_to_rownames("uid") |> 
        t()
    ),
    colData = 
      sample_and_visit_info_assay |> select(uid) |> 
      mutate(rownames = uid) |> as.data.frame() |> 
      column_to_rownames("rownames")
  )

Creating the MultiAssayExperiment object

Code
SE_list <- 
  list(
    visit_and_sample_info = se_sample_and_visit_info_assay,
    mg = se_mg,
    qPCR = se_qPRC,
    qPCR_daily = se_daily_qPRC,
    amplicon = se_16S,
    luminex = se_luminex,
    flow = se_flow
  )


mae <- 
  MultiAssayExperiment::MultiAssayExperiment(
    experiments = MultiAssayExperiment::ExperimentList(SE_list),
    colData = 
      mae_coldata |> select(-in_CRF, -in_assays) |> 
      as.data.frame() |> mutate(rownames = uid) |> 
      column_to_rownames("rownames"),
    metadata = list(
      study = "VIBRANT",
      data_source = "actual study data (not simulated)",
      data_status = "partially QCed",
      date = today(),
      colData_dictionary = participants_variable_dictionary,
      crf_plates_description = crf_plates_dictionary,
      crf_data_raw = crf_raw,
      crf_data_clean = crf_clean, 
      participant_crfs_merged = participant_crfs_merged,
      visits_crfs_merged = visits_crfs_merged,
      exposures = exposures
    )
  )
Code
mae
A MultiAssayExperiment object of 7 listed
 experiments with user-defined names and respective classes.
 Containing an ExperimentList class object of length 7:
 [1] visit_and_sample_info: SummarizedExperiment with 9 rows and 5463 columns
 [2] mg: SummarizedExperiment with 779 rows and 996 columns
 [3] qPCR: SummarizedExperiment with 16 rows and 1029 columns
 [4] qPCR_daily: SummarizedExperiment with 16 rows and 2990 columns
 [5] amplicon: SummarizedExperiment with 817 rows and 4119 columns
 [6] luminex: SummarizedExperiment with 48 rows and 850 columns
 [7] flow: SummarizedExperiment with 11 rows and 356 columns
Functionality:
 experiments() - obtain the ExperimentList instance
 colData() - the primary/phenotype DataFrame
 sampleMap() - the sample coordination DataFrame
 `$`, `[`, `[[` - extract colData columns, subset, or experiment
 *Format() - convert into a long or wide DataFrame
 assays() - convert ExperimentList to a SimpleList of matrices
 exportClass() - save data to flat files

Available assays

Code
mae_sample_map <- 
  mae@sampleMap |> as_tibble() |> dplyr::filter(assay %in% assay_list$mae_assay_name) |>
  select(uid = primary, assay) |> 
  arrange(uid) |> 
  dplyr::left_join(assay_list |> dplyr::rename(assay = mae_assay_name), by = join_by(assay)) |> 
  dplyr::right_join(
    mae@colData |> as.data.frame() |> as_tibble() |> 
      select(uid, pid, visit_code, PIPV, visit, visit_type, sample_type, location, randomized, arm), 
    by = join_by(uid)
    )

All samples

Code
mae_sample_map |> 
  dplyr::filter(!is.na(assay)) |> 
  group_by(uid) |> mutate(n = n()) |> ungroup() |> 
  mutate(uid = uid |> fct_reorder(-n)) |>
  ggplot() +
  aes(x = uid, y = assay_label |> fct_rev(), fill = assay_label) +
  geom_tile() +
  facet_grid(. ~ sample_type, scales = "free_x", space = "free_x") +
  scale_x_discrete("Sample IDs", breaks = NULL) +
  ylab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

Longitudinal availability for all visits

Code
mae_sample_map |> 
  dplyr::filter(!is.na(assay)) |> 
  # filter(visit_code %in% c("0000", seq(1000, 1900, by = 100), "2120")) |> 
  ggplot() +
  aes(x = assay_label, y = pid, fill = assay_label) +
  geom_tile() +
  facet_grid(
    ifelse(randomized, "Randomized", "Not randomized") + location ~ visit_code, 
    scales = "free_y", space = "free_y"
    ) + # sample_category
  scale_y_discrete("Participants IDs") + #, breaks = "NULL")  +
  xlab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

For the weekly visits of all participants

Code
mae_sample_map |> 
  dplyr::filter(!is.na(assay)) |> 
  dplyr::filter(visit_type == "Clinic") |> 
  ggplot() +
  aes(x = assay_label, y = pid, fill = assay_label) +
  geom_tile() +
  facet_grid(
    location + ifelse(randomized, "Randomized", "Not randomized")  + arm  ~ visit_code, 
    scales = "free_y", space = "free_y"
    ) + # sample_category
  scale_y_discrete("Participants IDs") + #, breaks = "NULL")  +
  xlab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

For the weekly visits of randomized participants

Code
mae_sample_map |> 
  dplyr::filter(randomized) |> 
  dplyr::filter(!is.na(assay)) |> 
  dplyr::filter(visit_type == "Clinic") |> 
  ggplot() +
  aes(x = assay_label, y = pid, fill = assay_label) +
  geom_tile() +
  facet_grid(
    location + ifelse(randomized, "Randomized", "Not randomized")  + arm  ~ visit_code, 
    scales = "free_y", space = "free_y"
    ) + # sample_category
  scale_y_discrete("Participants IDs") + #, breaks = "NULL")  +
  xlab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

Code
mae_sample_map |> 
  dplyr::filter(randomized) |> 
  dplyr::filter(!is.na(assay)) |> 
  dplyr::filter(PIPV) |> 
  ggplot() +
  aes(x = assay_label, y = pid, fill = assay_label) +
  geom_tile() +
  facet_grid(
    location + arm  ~ visit, 
    scales = "free_y", space = "free_y"
    ) + # sample_category
  scale_y_discrete("Participants IDs") + #, breaks = "NULL")  +
  xlab("") +
  scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
  theme(
    axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0),
    strip.text.x = element_text(angle = 90),
    legend.position = "bottom"
    )

Saving the MultiAssayExperiment objects

We export the full MultiAssayExperiment object as an .rds file for further analyses.

Code
saveRDS(
  mae, 
  str_c(
    get_02_output_dir(),
    "mae_full_", today() |> str_remove_all("-"), ".rds"
  )
)